First, we'll import some basic libraries and load our CSV dataset.

import numpy as np
import pandas as pd
import os
# NOTE(review): hard-coded absolute path — this only runs on the author's machine.
os.chdir("C:\\Users\\oshapira\\Desktop\\Analytics\\Uber\\data")
##import dataset and create data frame
df_raw = pd.read_csv('rideshare_kaggle.csv')

# Work on a copy so the raw frame stays untouched for reference.
df = df_raw.copy()

Next, we'll filter on the column features that are of interest for our analysis. We will also drop any 'NA' values, although an alternative step would be to impute these values instead.

### keep only the columns needed for the analysis, then drop incomplete rows
columns_to_keep = [
    'timestamp', 'hour', 'datetime', 'source', 'destination', 'cab_type',
    'name', 'price', 'distance', 'surge', 'temperature', 'precipIntensity',
    'humidity', 'windSpeed',
]

df = df.filter(items=columns_to_keep)
df = df.dropna()  # remove rows with any missing value (imputing would be the alternative)

Data Manipulation

Clean up and convert time stamp variables

Some additional features I wanted to evaluate for this regression analysis included time of day, and day of week - neither of which were included in this dataset. These columns were derived by doing the following:

##create new datetime column that is in date_time format
df['datetime_2'] = pd.to_datetime(df.datetime, format="%Y-%m-%d %H:%M:%S")

##add variable for day of week for given ride
##sunday = 6, monday = 0
# Vectorized .dt.weekday replaces the original per-row Python loop
# ([x.weekday() for x in ...]) — same values, far faster on a large frame.
df['weekday'] = df['datetime_2'].dt.weekday

dict_weekday = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}

# map() covers every weekday value 0-6, so the fillna is a no-op kept as a safety net
df['weekday'] = df['weekday'].map(dict_weekday).fillna(df['weekday'])


from datetime import datetime

####extract the time of day as an integer HHMMSS from the datetime variable
# Reuse the already-parsed 'datetime_2' column instead of re-parsing every
# raw string with datetime.strptime in a Python loop — same result, faster.
df['time'] = df['datetime_2'].dt.strftime("%H%M%S").astype(int)

Exploratory Data Analysis

import matplotlib.pyplot as plt
%matplotlib qt
import seaborn as sns
h, axes = plt.subplots (1,2, figsize=(12,4))


Ux=df.name[df.cab_type=='Uber'].unique()
Lx=df.name[df.cab_type=='Lyft'].unique()
Uy = df.name[df.name=='UberXL'].count(),df.name[df.name=='Black'].count(),\
     df.name[df.name=='UberX'].count(),df.name[df.name=='WAV'].count(),\
     df.name[df.name=='Black SUV'].count(),df.name[df.name=='UberPool'].count()

Ly=df.name[df.name=='Shared'].count(),df.name[df.name=='Lux'].count(),\
     df.name[df.name=='Lyft'].count(),df.name[df.name=='Lux Black XL'].count(),\
     df.name[df.name=='Lyft XL'].count(),df.name[df.name=='Lux Black'].count()
     
vis1= sns.barplot(Ux,np.array(Uy),palette='Accent',ax=axes[0])
vis2= sns.barplot(Lx,np.array(Ly),palette='Accent',ax=axes[1])


axes[0].set_title('Number of Uber Rides')
axes[1].set_title('Number of Lyft Rides')
plt.ioff()
C:\Users\oshapira\AppData\Local\Continuum\Anaconda3\lib\site-packages\seaborn\categorical.py:1428: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)

I decided to map the ride types in this data set to 'Shared', 'Luxury', and 'Standard'. I figured this would simplify my analysis while grouping together ride types that should have similar prices.

###create column for ride types so that Uber and Lyft can be more comparable
types = list(df.name.unique())

# Product name -> comparable tier, grouped here by tier for readability.
types_map = {
    'Shared': 'Shared', 'UberPool': 'Shared',
    'Lyft': 'Standard', 'UberX': 'Standard',
    'Lyft XL': 'XL', 'UberXL': 'XL',
    'Lux': 'Luxury', 'Lux Black XL': 'Luxury', 'Lux Black': 'Luxury',
    'Black': 'Luxury', 'WAV': 'Luxury', 'Black SUV': 'Luxury',
    'Taxi': 'Luxury',
}

# Any unmapped product name falls back to the original name unchanged.
df['ride_type'] = df['name'].map(types_map).fillna(df['name'])


# small sample for eyeballing the result
df_sample = df.head(n = 100)

I will start by building a class, which includes a function for filtering the data based on my selected features of interest.

from sklearn.model_selection import train_test_split
from scipy import stats
import statsmodels.api as sm
from scipy.stats import f as fisher_f
from sklearn import metrics
from scipy import stats
from sklearn.linear_model import LinearRegression 
import numpy as np

class MyLinearRegression:
    """Feature filtering for the ride-price data set.

    Relies on the module-level DataFrame ``df`` containing the columns
    'ride_type' and 'cab_type' plus the requested features.
    (This class is redefined with more methods further down the notebook.)
    """

    def __init__(self, features, cab_type, ride_type):
        self.features = features    # column names to keep (caller includes 'price')
        self.cab_type = cab_type    # list of cab types, e.g. ['Uber']
        self.ride_type = ride_type  # list of ride tiers, e.g. ['Standard']

    def filter_data(self):
        """Return ``df`` restricted to the selected rides and features,
        with 'weekday' one-hot encoded (first level dropped) when requested."""
        mask = df['ride_type'].isin(self.ride_type) & df['cab_type'].isin(self.cab_type)
        df_filter = df.loc[mask].filter(items=self.features)
        # Exact membership test — the original scanned for 'weekday' as a
        # substring of every column name, which would also match e.g.
        # 'weekday_lag' and encode the wrong column.
        if 'weekday' in df_filter.columns:
            dummies = pd.get_dummies(df_filter['weekday'], prefix='Day', drop_first=True)
            df_filter = pd.concat([df_filter, dummies], axis=1)
            df_filter.drop(['weekday'], axis=1, inplace=True)
        return df_filter.reset_index(drop=True)

Next, I wanted to get a feel for the correlation between the features of interest I had selected. So I created a scatterplot matrix.

from pandas.plotting import scatter_matrix


def scatter_matrix_plot(variables, cab_type, ride_type):
    """Show a scatter matrix (KDE on the diagonal) for the selected features."""
    data = MyLinearRegression(variables, cab_type, ride_type).filter_data()
    scatter_matrix(data, alpha=0.1, figsize=(15, 15), diagonal='kde')
    return plt.show()


scatter_matrix_plot(['price','distance', 'temperature','precipIntensity', 'humidity', 'windSpeed', 'time'], ['Uber'], ['Shared'])

Building a Prediction Model

Since there are many functions I would have to repeatedly call for my model testing, I built a class to help keep my code organized.

from sklearn.model_selection import train_test_split
from scipy import stats
import statsmodels.api as sm
from scipy.stats import f as fisher_f
from sklearn import metrics
from scipy import stats
from sklearn.linear_model import LinearRegression 
import numpy as np

class MyLinearRegression:
    """Linear-regression workflow: filter -> split -> fit -> evaluate.

    Redefinition of the earlier class, extended with model building and
    train/test metrics.  Relies on the module-level DataFrame ``df``.
    The split uses random_state=0, so every method sees the identical
    train/test partition and results are reproducible across calls.
    """

    def __init__(self, features, cab_type, ride_type):
        self.features = features    # column names to keep (caller includes 'price')
        self.cab_type = cab_type    # list of cab types, e.g. ['Uber']
        self.ride_type = ride_type  # list of ride tiers, e.g. ['Standard']

    def filter_data(self):
        """Return ``df`` restricted to the selected rides and features,
        with 'weekday' one-hot encoded (first level dropped) when requested."""
        mask = df['ride_type'].isin(self.ride_type) & df['cab_type'].isin(self.cab_type)
        df_filter = df.loc[mask].filter(items=self.features)
        if 'weekday' in df_filter.columns:
            dummies = pd.get_dummies(df_filter['weekday'], prefix='Day', drop_first=True)
            df_filter = pd.concat([df_filter, dummies], axis=1)
            df_filter.drop(['weekday'], axis=1, inplace=True)
        return df_filter.reset_index(drop=True)

    def split_data(self):
        """Deterministic 80/20 train/test split with 'price' as the target."""
        df_filter = self.filter_data()
        X = df_filter.loc[:, df_filter.columns != 'price']
        Y = df_filter['price']
        return train_test_split(X, Y, test_size=0.2, random_state=0)

    def create_model(self):
        """Fit a LinearRegression on the training split and return it.

        The returned model is already fitted — callers must not refit it
        (the original refitted in every downstream method, wasted work)."""
        X_train, X_test, y_train, y_test = self.split_data()
        model_cab = LinearRegression()
        model_cab.fit(X_train, y_train)
        return model_cab

    def predict_model(self):
        """Predictions on the held-out test split."""
        X_train, X_test, y_train, y_test = self.split_data()
        return self.create_model().predict(X_test)

    def linear_formula(self):
        """Print y = slope*x + intercept (first coefficient only, so this
        is meaningful for the single-feature model)."""
        model = self.create_model()
        slope = model.coef_[0]
        intercept = model.intercept_
        print(f"The linear regression line for " + "is y = " + str(round(slope,2))+ "x + " + str(round(intercept,2)))

    # --- metrics on the held-out test split -------------------------------
    def metric_r2(self):
        """R^2 on the test split, rounded to 3 decimals."""
        X_train, X_test, y_train, y_test = self.split_data()
        return round(self.create_model().score(X_test, y_test), 3)

    def metric_MAE(self):
        """Mean absolute error on the test split, rounded to 3 decimals."""
        X_train, X_test, y_train, y_test = self.split_data()
        return round(metrics.mean_absolute_error(self.predict_model(), y_test), 3)

    def metric_MSE(self):
        """Mean squared error on the test split, rounded to 3 decimals."""
        X_train, X_test, y_train, y_test = self.split_data()
        return round(metrics.mean_squared_error(self.predict_model(), y_test), 3)

    def metric_RMSE(self):
        """Root mean squared error on the test split, rounded to 3 decimals."""
        X_train, X_test, y_train, y_test = self.split_data()
        return round(np.sqrt(metrics.mean_squared_error(self.predict_model(), y_test)), 3)

    # --- metrics on the training split (over/under-fit diagnostics) -------
    def predict_model_train(self):
        """Predictions on the training split."""
        X_train, X_test, y_train, y_test = self.split_data()
        return self.create_model().predict(X_train)

    def metric_r2_train(self):
        """R^2 on the training split, rounded to 3 decimals."""
        X_train, X_test, y_train, y_test = self.split_data()
        return round(self.create_model().score(X_train, y_train), 3)

    def metric_MAE_train(self):
        """Mean absolute error on the training split, rounded to 3 decimals."""
        X_train, X_test, y_train, y_test = self.split_data()
        return round(metrics.mean_absolute_error(self.predict_model_train(), y_train), 3)

    def metric_MSE_train(self):
        """Mean squared error on the training split, rounded to 3 decimals."""
        X_train, X_test, y_train, y_test = self.split_data()
        return round(metrics.mean_squared_error(self.predict_model_train(), y_train), 3)

    def metric_RMSE_train(self):
        """Root mean squared error on the training split, rounded to 3 decimals."""
        X_train, X_test, y_train, y_test = self.split_data()
        return round(np.sqrt(metrics.mean_squared_error(self.predict_model_train(), y_train)), 3)

Since I suspect that several features will have an impact on Uber/Lyft prices, I wanted to build a multiple regression model. When doing so, I wanted to ensure that the variables included in this model have a statistically significant impact on price. Therefore, I built a function that inputs all features, removes any features that have a p-value higher than my alpha (in this case 0.05), and keeps repeating this process until none of the features left have a p-value > alpha. My final output will be a summary of the regression model and the variables left, using this "backwards elimination" technique.

def linear_params_test(variables, cab_type, ride_type, sigvalue):
    """Backwards elimination with statsmodels OLS.

    Fits OLS on the training split and repeatedly drops the feature with
    the largest p-value until every remaining p-value is <= ``sigvalue``.

    Returns the final fitted statsmodels results object.
    """
    returned_df = MyLinearRegression(variables, cab_type, ride_type)
    df_filter = returned_df.filter_data()
    X = df_filter.loc[:, df_filter.columns != 'price']
    Y = df_filter['price']
    X = sm.add_constant(X)  # statsmodels does not add an intercept by default
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

    lrmodel = sm.OLS(y_train, X_train).fit()
    pVals = lrmodel.pvalues
    while pVals.max() > sigvalue:
        # Drop the single worst (highest p-value) feature, then refit.
        # (The original iterated over every p-value only to recompute the
        # same argmax each pass — one idxmax call is equivalent.)
        column_drop = pVals.idxmax()
        X_train.drop([column_drop], axis=1, inplace=True)
        X_test.drop([column_drop], axis=1, inplace=True)
        lrmodel = sm.OLS(y_train, X_train).fit()
        pVals = lrmodel.pvalues

    return lrmodel


def linear_params_summary(variables, cab_type, ride_type, sigvalue):
    """Run backwards elimination and return the statsmodels summary table."""
    final_model = linear_params_test(variables, cab_type, ride_type, sigvalue)
    return final_model.summary()
#jupyter nbextension enable --py widgetsnbextension
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display
# button_output = widgets.Button(description = "Execute")
# Output areas: 'output' echoes the current widget selections,
# 'other_output' holds the regression summary table.
output = widgets.Output()
other_output = widgets.Output()



# Multi-select: which provider(s) to include in the regression.
cab_type =widgets.SelectMultiple(
            options=['Uber', 'Lyft'],
            value=['Uber'],
            description='Cab Type',
            disabled=False
)

# Multi-select: which ride tier(s) to include.
ride_type =widgets.SelectMultiple(
            options=['Shared', 'Luxury', 'Standard'],
            value=['Standard'],
            description='Ride Type',
            disabled=False
)

# Significance threshold alpha used by the backwards-elimination loop.
sigvalue =widgets.FloatSlider(
            min = 0,
            max = 1,
            step = .05,
            description = 'Significance Value Alpha:',
            value = 0.05
)

def linear_backwards_elim(cab_type, ride_type, sigvalue):
    """Refresh both output widgets: echo the current selections, then show
    the backwards-elimination OLS summary for the fixed feature set."""
    output.clear_output()
    other_output.clear_output()
    cab_type = list(cab_type)
    ride_type = list(ride_type)
    with output:
        display(cab_type, ride_type, sigvalue)
    with other_output:
        variables = ['price','distance', 'temperature','precipIntensity', 'humidity', 'windSpeed', 'time']
        final_model = linear_params_test(variables, cab_type, ride_type, sigvalue)
        display(final_model.summary())


def cab_type_event(change):
    """Re-run the elimination when the cab-type selection changes."""
    linear_backwards_elim(change.new, ride_type.value, sigvalue.value)
        
def ride_type_event(change):
    """Re-run the elimination when the ride-type selection changes."""
    linear_backwards_elim(cab_type.value, change.new, sigvalue.value)
    
def sig_value_event(change):
    """Re-run the elimination when the alpha slider moves."""
    linear_backwards_elim(cab_type.value, ride_type.value, change.new)
        
        
# Wire each widget's 'value' trait to its handler, then render the widgets
# and the two output areas.
cab_type.observe(cab_type_event, names = 'value')
ride_type.observe(ride_type_event, names = 'value')
sigvalue.observe(sig_value_event, names = 'value')


display(cab_type)
display(ride_type)
display(sigvalue)
display(output)
display(other_output)

Next, I tested out polynomial regressions.

############different degrees with regression

import statsmodels.formula.api as smf


def reg_params_poly(variables, cab_type, ride_type, degree):
    """Fit a 1-D polynomial (price ~ distance) of the given degree on the
    training split and return the np.poly1d model.

    (The original also built an unused 'combined_df' with a misspelled
    column name — dead code, removed.)
    """
    returned_df = MyLinearRegression(variables, cab_type, ride_type)
    X_train, X_test, y_train, y_test = returned_df.split_data()
    weights = np.polyfit(X_train['distance'].values, y_train.values, degree)
    return np.poly1d(weights)

def reg_poly_predict(variables, cab_type, ride_type, degree):
    """Polynomial-model predictions on the test split's distances.

    The split is deterministic (random_state=0), so the test rows here
    match those used inside reg_params_poly.
    """
    returned_df = MyLinearRegression(variables, cab_type, ride_type)
    X_test = returned_df.split_data()[1]['distance'].values
    model = reg_params_poly(variables, cab_type, ride_type, degree)
    return model(X_test)

# predict= reg_poly_predict(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],3)
# predict= reg_poly_predict(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],2)


#from sklearn.metrics import mean_squared_error, r2_score

def reg_poly_r2(variables, cab_type, ride_type, degree):
    """R^2 of the degree-`degree` polynomial on the test split (3 dp)."""
    y_test = MyLinearRegression(variables, cab_type, ride_type).split_data()[3].values
    predict = reg_poly_predict(variables, cab_type, ride_type, degree)
    return round(metrics.r2_score(y_test, predict), 3)

# reg_poly_r2(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],2)
# reg_poly_r2(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],3)
# reg_poly_r2(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],4)



def reg_poly_MAE(variables, cab_type, ride_type, degree):
    """MAE of the degree-`degree` polynomial on the test split (3 dp)."""
    y_test = MyLinearRegression(variables, cab_type, ride_type).split_data()[3].values
    predict = reg_poly_predict(variables, cab_type, ride_type, degree)
    return round(metrics.mean_absolute_error(y_test, predict), 3)

# reg_poly_MAE(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],3)

def reg_poly_MSE(variables, cab_type, ride_type, degree):
    """MSE of the degree-`degree` polynomial on the test split (3 dp)."""
    y_test = MyLinearRegression(variables, cab_type, ride_type).split_data()[3].values
    predict = reg_poly_predict(variables, cab_type, ride_type, degree)
    return round(metrics.mean_squared_error(y_test, predict), 3)

# reg_poly_MSE(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],3)

def reg_poly_RMSE(variables, cab_type, ride_type, degree):
    """RMSE of the degree-`degree` polynomial on the test split (3 dp)."""
    y_test = MyLinearRegression(variables, cab_type, ride_type).split_data()[3].values
    predict = reg_poly_predict(variables, cab_type, ride_type, degree)
    return round(np.sqrt(metrics.mean_squared_error(y_test, predict)), 3)



def reg_poly_predict_train(variables, cab_type, ride_type, degree):
    """Polynomial-model predictions on the training split's distances."""
    X_train = MyLinearRegression(variables, cab_type, ride_type).split_data()[0]['distance'].values
    model = reg_params_poly(variables, cab_type, ride_type, degree)
    return model(X_train)

# predict= reg_poly_predict(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],3)
# predict= reg_poly_predict(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],2)


#from sklearn.metrics import mean_squared_error, r2_score

def reg_poly_r2_train(variables, cab_type, ride_type, degree):
    """R^2 of the degree-`degree` polynomial on the training split (3 dp)."""
    y_train = MyLinearRegression(variables, cab_type, ride_type).split_data()[2].values
    predict = reg_poly_predict_train(variables, cab_type, ride_type, degree)
    return round(metrics.r2_score(y_train, predict), 3)

# reg_poly_r2(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],2)
# reg_poly_r2(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],3)
# reg_poly_r2(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],4)



def reg_poly_MAE_train(variables, cab_type, ride_type, degree):
    """MAE of the degree-`degree` polynomial on the training split (3 dp)."""
    y_train = MyLinearRegression(variables, cab_type, ride_type).split_data()[2].values
    predict = reg_poly_predict_train(variables, cab_type, ride_type, degree)
    return round(metrics.mean_absolute_error(y_train, predict), 3)

# reg_poly_MAE(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],3)

def reg_poly_MSE_train(variables, cab_type, ride_type, degree):
    """MSE of the degree-`degree` polynomial on the training split (3 dp)."""
    y_train = MyLinearRegression(variables, cab_type, ride_type).split_data()[2].values
    predict = reg_poly_predict_train(variables, cab_type, ride_type, degree)
    return round(metrics.mean_squared_error(y_train, predict), 3)

# Stray evaluation cell: cubic-fit test MSE for Standard rides, both providers.
reg_poly_MSE(['distance', 'price'],['Uber', 'Lyft'], ['Standard'],3)

def reg_poly_RMSE_train(variables, cab_type, ride_type, degree):
    """RMSE of the degree-`degree` polynomial on the training split (3 dp)."""
    y_train = MyLinearRegression(variables, cab_type, ride_type).split_data()[2].values
    predict = reg_poly_predict_train(variables, cab_type, ride_type, degree)
    return round(np.sqrt(metrics.mean_squared_error(y_train, predict)), 3)

Then, I performed the Chow test.

###########chow test

from scipy.stats import f as fisher_f
def chow_test(alpha_val, ride_type):
    """Chow test: do Uber and Lyft need separate price ~ distance lines?

    Compares the pooled-regression SSE against the sum of the per-provider
    SSEs; a large F statistic (small p-value) means the two providers'
    regression lines differ significantly.

    NOTE(review): `ride_type` is accepted but the comparisons below are
    hard-coded to 'Standard' rides — TODO pass it through.
    """
    ##pooled dataset (Uber + Lyft); predictions vs held-out actuals
    # (The original swapped the names: 'y_test_*' held predictions and
    # 'predict_*' held actuals.  SSEs were unaffected — the squared
    # difference is symmetric — but the names were misleading.)
    all_reg = MyLinearRegression(['price', 'distance'], ['Uber', 'Lyft'], ['Standard'])
    pred_all = all_reg.predict_model()
    actual_all = np.asarray(all_reg.split_data()[3])

    ##Predictions for Uber Only
    uber_reg = MyLinearRegression(['price', 'distance'], ['Uber'], ['Standard'])
    pred_uber = uber_reg.predict_model()
    actual_uber = np.asarray(uber_reg.split_data()[3])

    ##Predictions for Lyft Only
    lyft_reg = MyLinearRegression(['price', 'distance'], ['Lyft'], ['Standard'])
    pred_lyft = lyft_reg.predict_model()
    actual_lyft = np.asarray(lyft_reg.split_data()[3])

    ##Add up Sum of Squared Errors
    SSE_all = np.sum((actual_all - pred_all) ** 2)
    SSE_uber = np.sum((actual_uber - pred_uber) ** 2)
    SSE_lyft = np.sum((actual_lyft - pred_lyft) ** 2)

    N = len(actual_all)
    k = 2  # parameters per regression: slope and intercept
    numerator = (SSE_all - (SSE_lyft + SSE_uber)) / k
    denominator = (SSE_lyft + SSE_uber) / (N - 2 * k)
    f_statistic = numerator / denominator

    # BUG FIX: the original used fisher_f.cdf(F, ...) as the p-value and
    # declared significance when it exceeded alpha.  The p-value of an F
    # test is the upper tail, 1 - cdf (equivalently fisher_f.sf), and
    # significance means p < alpha.
    p_value = fisher_f.sf(f_statistic, k, N - 2 * k)

    if p_value < alpha_val:
        return str('The Chow test shows that there is a statistically significant difference between the Uber and Lyft regression lines')
    else:
        return str('The Chow test shows that there is not a statistically significant difference between the Uber and Lyft regression lines')
chow_test(.05,['Standard'])
'The Chow test shows that there is not a statistically significant difference between the Uber and Lyft regression lines'
# jupyter nbextension enable --py widgetsnbextension
from plotly.offline import init_notebook_mode, iplot
import plotly.express as px
from plotly.offline import plot
import plotly.graph_objs as go


def lin_reg_cab_df(cab_type, ride_type):
    """Fit price ~ distance for one provider and return a tidy frame with
    the test split's distance, actual_price and predicted_price columns."""
    subset = df.filter(items=['price', 'distance', 'ride_type', 'cab_type'])
    subset = subset.loc[subset['ride_type'].isin(ride_type)]
    subset_cab = subset.loc[subset['cab_type'] == cab_type]

    X_cab = subset_cab.loc[:, subset_cab.columns == 'distance']
    Y_cab = subset_cab.loc[:, subset_cab.columns == 'price']
    X_train_cab, X_test_cab, y_train_cab, y_test_cab = train_test_split(
        X_cab, Y_cab, test_size=0.2, random_state=0)

    model_cab = LinearRegression()
    model_cab.fit(X_train_cab, y_train_cab)

    # assemble actuals and predictions side by side, index-aligned from 0
    pieces = [
        pd.DataFrame(X_test_cab['distance'].reset_index(drop=True)),
        pd.DataFrame(y_test_cab['price'].reset_index(drop=True)),
        pd.DataFrame(model_cab.predict(X_test_cab)),
    ]
    cab_df = pd.concat(pieces, axis=1)
    cab_df.columns = ['distance', 'actual_price', 'predicted_price']
    return cab_df




import plotly.express as px
from plotly.offline import plot
import plotly.graph_objs as go

def plot_lin_reg(ride_type):
    """Scatter actual prices and overlay the fitted regression line for
    both providers on one plotly figure."""
    uber_df = lin_reg_cab_df('Uber', ride_type)
    lyft_df = lin_reg_cab_df('Lyft', ride_type)

    def _trace(frame, column, label, as_line):
        """One plotly trace: markers for actuals, lines for the fit."""
        if as_line:
            return go.Scatter(x=frame['distance'], y=frame[column],
                              mode='lines', name=label)
        return go.Scatter(x=frame['distance'], y=frame[column],
                          mode='markers', marker=dict(opacity=0.5), name=label)

    data = [
        _trace(uber_df, 'actual_price', 'Uber Actual Prices', False),
        _trace(lyft_df, 'actual_price', 'Lyft Actual Prices', False),
        _trace(uber_df, 'predicted_price', 'Uber Regression Line', True),
        _trace(lyft_df, 'predicted_price', 'Lyft Regression Line', True),
    ]
    return iplot(data)


plot_lin_reg(['Standard'])

Then I plotted learning curve

def plot_learning_curves(model, X, y):
    """Plot train/validation RMSE against training-set size (1..499) for
    the simple price ~ distance Uber 'Standard' model.

    NOTE(review): X and y are accepted but unused — the data actually
    comes from MyLinearRegression's deterministic split; confirm intent.
    """
    splitter = MyLinearRegression(['price','distance'], ['Uber'],['Standard'])
    X_train, X_val, y_train, y_val = splitter.split_data()
    train_errors = []
    val_errors = []
    for m in range(1, 500):
        model.fit(X_train[:m], y_train[:m])
        train_errors.append(metrics.mean_squared_error(y_train[:m], model.predict(X_train[:m])))
        val_errors.append(metrics.mean_squared_error(y_val, model.predict(X_val)))
    plt.figure(figsize=(15, 9))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.title('Learning Curve')
    plt.xlabel('Number of Samples')
    plt.ylabel('RMSE')
    plt.legend(loc = 'upper right')
    return plt.plot()
X = df['distance']
y = df['price']
lin_reg = LinearRegression()
plot_learning_curves(lin_reg, X, y)
[]
def plot_learning_curves_multi(model, X, y):
    """Plot train/validation RMSE against training-set size (1..499) for
    the multi-feature Uber 'Standard' model.

    NOTE(review): X and y are accepted but unused — the data actually
    comes from MyLinearRegression's deterministic split; confirm intent.
    """
    splitter = MyLinearRegression(['price','distance', 'temperature','precipIntensity', 'humidity', 'windSpeed', 'time'], ['Uber'],['Standard'])
    X_train, X_val, y_train, y_val = splitter.split_data()
    train_errors = []
    val_errors = []
    for m in range(1, 500):
        model.fit(X_train[:m], y_train[:m])
        train_errors.append(metrics.mean_squared_error(y_train[:m], model.predict(X_train[:m])))
        val_errors.append(metrics.mean_squared_error(y_val, model.predict(X_val)))
    plt.figure(figsize=(15, 9))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.title('Learning Curve')
    plt.xlabel('Number of Samples')
    plt.ylabel('RMSE')
    plt.legend(loc = 'upper right')
    return plt.plot()
X = df[['distance', 'temperature','precipIntensity', 'humidity', 'windSpeed', 'time']]
y = df['price']
lin_reg = LinearRegression()
plot_learning_curves_multi(lin_reg, X, y)
[]
def plot_learning_curves_poly(X, y):
    """Learning curve for a cubic polynomial fit of price ~ distance
    (Uber 'Standard' rides): train/validation RMSE vs. training size.

    NOTE(review): X and y are accepted but unused, and the degree is
    hard-coded to 3 (the module-level `degree` variable is ignored) —
    confirm intent.
    """
    splitter = MyLinearRegression(['price','distance'], ['Uber'],['Standard'])
    # One split call (the original called split_data() twice and built an
    # unused 'combined_df' — redundant work, same values either way since
    # the split is deterministic).
    X_train, X_val, y_train, y_val = splitter.split_data()
    x_train_arr = X_train['distance'].values
    y_train_arr = y_train.values
    train_errors, val_errors = [], []
    for m in range(1, 500):
        model = np.poly1d(np.polyfit(x_train_arr[:m], y_train_arr[:m], 3))
        # np.poly1d applies elementwise; X_val is the single-column
        # 'distance' frame, so model(X_val) evaluates per row
        train_errors.append(metrics.mean_squared_error(y_train_arr[:m], model(x_train_arr[:m])))
        val_errors.append(metrics.mean_squared_error(y_val, model(X_val)))
    plt.figure(figsize=(15, 9))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.title('Learning Curve')
    plt.xlabel('Number of Samples')
    plt.ylabel('RMSE')
    plt.legend(loc = 'upper right')
    return plt.plot()
X = df['distance']
y = df['price']
degree = 3

plot_learning_curves_poly(X, y)

#     weights = np.polyfit(X_train, y_train, degree)
#     model = np.poly1d(weights)

#     model = reg_params_poly(variables, cab_type, ride_type, degree)
#     predict = model(X_test)
C:\Users\oshapira\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel_launcher.py:7: RankWarning:

Polyfit may be poorly conditioned

C:\Users\oshapira\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel_launcher.py:7: RankWarning:

Polyfit may be poorly conditioned

C:\Users\oshapira\AppData\Local\Continuum\Anaconda3\lib\site-packages\ipykernel_launcher.py:7: RankWarning:

Polyfit may be poorly conditioned

[]

Then I aggregated the performance metrics of all the linear models I tested.

def performance_metrics(cab_type, ride_type):
    """Aggregate test- and train-set metrics for every linear model variant.

    Parameters
    ----------
    cab_type : list of str, e.g. ['Uber', 'Lyft']
    ride_type : list of str, e.g. ['Standard']

    Returns
    -------
    pandas.DataFrame with one row per regression type and columns for
    r2 / MAE / MSE / RMSE on both the held-out and the training data.
    """
    metric_columns = ['Regression Type', 'r2', 'MAE', 'MSE', 'RMSE',
                      'r2_train', 'MAE_train', 'MSE_train', 'RMSE_train']

    def _row(label, values):
        # Wrap one model's label + 8 metric values into a single-row frame.
        return pd.DataFrame(np.asarray([label] + values).reshape(1, 9),
                            columns=metric_columns)

    def _linear_metrics(features):
        # Test-set metrics first, then the same four on the training set.
        reg = MyLinearRegression(features, cab_type, ride_type)
        return [reg.metric_r2(), reg.metric_MAE(),
                reg.metric_MSE(), reg.metric_RMSE(),
                reg.metric_r2_train(), reg.metric_MAE_train(),
                reg.metric_MSE_train(), reg.metric_RMSE_train()]

    def _poly_metrics(features, degree):
        # Same metric layout for the numpy-polyfit based models.
        return [reg_poly_r2(features, cab_type, ride_type, degree),
                reg_poly_MAE(features, cab_type, ride_type, degree),
                reg_poly_MSE(features, cab_type, ride_type, degree),
                reg_poly_RMSE(features, cab_type, ride_type, degree),
                reg_poly_r2_train(features, cab_type, ride_type, degree),
                reg_poly_MAE_train(features, cab_type, ride_type, degree),
                reg_poly_MSE_train(features, cab_type, ride_type, degree),
                reg_poly_RMSE_train(features, cab_type, ride_type, degree)]

    multi_features = ['price', 'distance', 'temperature', 'precipIntensity',
                      'humidity', 'windSpeed', 'time', 'weekday']
    all_dfs = [
        _row('Simple Linear Regression', _linear_metrics(['price', 'distance'])),
        _row('Multiple Linear Regression', _linear_metrics(multi_features)),
        _row('Quadratic Regression', _poly_metrics(['price', 'distance'], 2)),
        _row('3-Degree Polynomial Regression', _poly_metrics(['price', 'distance'], 3)),
    ]
    return pd.concat(all_dfs).reset_index(drop=True)
# Compare linear-model metrics across both cab companies, then Uber alone.
performance_metrics(['Uber', 'Lyft'], ['Standard'])

performance_metrics(['Uber'], ['Standard'])
Regression Type r2 MAE MSE RMSE r2_train MAE_train MSE_train RMSE_train
0 Simple Linear Regression 0.541 1.07 2.903 1.704 0.54 1.052 2.765 1.663
1 Multiple Linear Regression 0.54 1.07 2.903 1.704 0.54 1.052 2.764 1.663
2 Quadratic Regression 0.541 1.07 2.902 1.703 0.54 1.052 2.764 1.663
3 3-Degree Polynomial Regression 0.542 1.07 2.891 1.7 0.542 1.052 2.758 1.661
######################KNN REGRESSION##################
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import neighbors


class MyKnnRegression:
    """KNN price regression over the (globally defined) rides DataFrame ``df``.

    Parameters
    ----------
    features : list of str
        Columns to use; must include 'price' (the target).
    cab_type : list of str
        Cab companies to keep, e.g. ['Uber'].
    ride_type : list of str
        Ride tiers to keep, e.g. ['Standard'].
    K : int
        Number of neighbors for KNeighborsRegressor.
    """

    def __init__(self, features, cab_type, ride_type, K):
        self.features = features
        self.cab_type = cab_type
        self.ride_type = ride_type
        self.K = K

    def filter_data(self):
        """Filter ``df`` to the requested rides/columns; one-hot encode weekday."""
        # NOTE(review): relies on `df` having 'ride_type' and 'cab_type'
        # columns — 'ride_type' is not in the initial columns_to_keep list,
        # so it is presumably derived in an earlier cell; confirm.
        df_filter = df.loc[(df['ride_type'].isin(self.ride_type)) &
                           (df['cab_type'].isin(self.cab_type))]
        df_filter = df_filter.filter(items=self.features)
        if 'weekday' in df_filter.columns:
            # One-hot encode day of week; drop the first level to avoid
            # perfect collinearity among the dummies.
            dummies = pd.get_dummies(df_filter['weekday'], prefix='Day',
                                     drop_first=True)
            df_filter = pd.concat([df_filter, dummies], axis=1)
            df_filter.drop(['weekday'], axis=1, inplace=True)
        return df_filter.reset_index(drop=True)

    def split_data(self):
        """Standardize features and return X_train, X_test, y_train, y_test."""
        df_filter = self.filter_data()
        X = df_filter.loc[:, df_filter.columns != 'price']
        Y = df_filter['price']
        # NOTE(review): the scaler is fit on ALL rows before splitting, so
        # test-set statistics leak into the scaling. Kept as-is to preserve
        # existing results; fit on X_train only to avoid the leakage.
        scaler = StandardScaler()
        scaler.fit(X)
        X = scaler.transform(X)
        # random_state=0 keeps every call's split identical, so the metric
        # methods below can safely re-split instead of caching.
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.2, random_state=0)
        return X_train, X_test, y_train, y_test

    def create_model(self):
        """Return a KNeighborsRegressor already fit on the training split."""
        X_train, _, y_train, _ = self.split_data()
        model_cab = neighbors.KNeighborsRegressor(n_neighbors=self.K)
        model_cab.fit(X_train, y_train)
        return model_cab

    def _predictions(self, on_train=False):
        """Return (predictions, true targets) for the chosen split.

        create_model() already fits the estimator, so no re-fit is needed here
        (the original code fit each model twice per metric call).
        """
        X_train, X_test, y_train, y_test = self.split_data()
        model = self.create_model()
        if on_train:
            return model.predict(X_train), y_train
        return model.predict(X_test), y_test

    def predict_model(self):
        """Predicted prices for the held-out test split."""
        predicted, _ = self._predictions(on_train=False)
        return predicted

    def metric_r2(self):
        """R-squared on the test split, rounded to 3 decimals."""
        _, X_test, _, y_test = self.split_data()
        return round(self.create_model().score(X_test, y_test), 3)

    def metric_MAE(self):
        """Mean absolute error on the test split."""
        predicted, y_true = self._predictions()
        return round(metrics.mean_absolute_error(y_true, predicted), 3)

    def metric_MSE(self):
        """Mean squared error on the test split."""
        predicted, y_true = self._predictions()
        return round(metrics.mean_squared_error(y_true, predicted), 3)

    def metric_RMSE(self):
        """Root mean squared error on the test split."""
        predicted, y_true = self._predictions()
        return round(np.sqrt(metrics.mean_squared_error(y_true, predicted)), 3)

    def predict_model_train(self):
        """Predicted prices for the training split (over-fitting diagnostics)."""
        predicted, _ = self._predictions(on_train=True)
        return predicted

    def metric_r2_train(self):
        """R-squared on the training split."""
        X_train, _, y_train, _ = self.split_data()
        return round(self.create_model().score(X_train, y_train), 3)

    def metric_MAE_train(self):
        """Mean absolute error on the training split."""
        predicted, y_true = self._predictions(on_train=True)
        return round(metrics.mean_absolute_error(y_true, predicted), 3)

    def metric_MSE_train(self):
        """Mean squared error on the training split."""
        predicted, y_true = self._predictions(on_train=True)
        return round(metrics.mean_squared_error(y_true, predicted), 3)

    def metric_RMSE_train(self):
        """Root mean squared error on the training split."""
        predicted, y_true = self._predictions(on_train=True)
        return round(np.sqrt(metrics.mean_squared_error(y_true, predicted)), 3)
    
    
# Quick smoke test: training-set predictions for a K=3 Uber Standard model.
test = MyKnnRegression(['price','distance'], ['Uber'],['Standard'],3)
test = test.predict_model_train()
test  # notebook cell echo of the predicted-price array
array([ 7.33333333, 12.66666667,  8.16666667, ..., 13.16666667,
        8.66666667,  7.83333333])
from sklearn.metrics import mean_squared_error 
from math import sqrt
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import plot

def knn_regression_filter(variables, cab_type, ride_type):
    """Plot test-set RMSE of KNN regression for each odd K in [3, 19].

    Produces a plotly "elbow" chart so a good number of neighbors can be
    picked by eye.

    Parameters
    ----------
    variables : list of str — feature columns, must include 'price'.
    cab_type, ride_type : list of str — filters passed to MyKnnRegression.
    """
    rmse_val = []  # RMSE per K value
    k_list = []
    for K in range(3, 21, 2):
        returned_knn = MyKnnRegression(variables, cab_type, ride_type, K)
        # metric_RMSE() performs its own filter/scale/split internally, so
        # no separate split_data() call is needed here (the original made an
        # extra, unused one per iteration).
        rmse_val.append(returned_knn.metric_RMSE())
        k_list.append(K)

    curve = pd.DataFrame({'k_value': k_list, 'rmse': rmse_val})

    fig = go.Figure(data=go.Scatter(x=curve['k_value'], y=curve['rmse']))
    fig.update_layout(
            title = 'Accuracy per number of K Neighbors',
            xaxis = dict(
                    tick0 = 3,
                    dtick = 2,
                    title_text = '# of Neighbors'
                    ),
            yaxis = dict(
                    title_text = 'Root Mean Squared Error')
                )
    # NOTE(review): `iplot` is not among the visible plotly imports (only
    # `plot` is imported above); it is presumably imported in an earlier
    # notebook cell — confirm.
    return iplot(fig)


# Elbow plot over K for the full multi-feature KNN model, both companies.
knn_regression_filter(['price','distance', 'temperature','precipIntensity', 'humidity', 'windSpeed', 'time', 'weekday'], ['Uber', 'Lyft'], ['Standard'])
def performance_metrics(cab_type, ride_type):
    """Aggregate test- and train-set metrics for the KNN models (K = 7).

    Redefines the earlier linear-model `performance_metrics` (notebook style)
    with the same output layout, so the tables are directly comparable.

    Returns
    -------
    pandas.DataFrame with one row per model and columns for r2 / MAE / MSE /
    RMSE on both the held-out and the training data.
    """
    metric_columns = ['Regression Type', 'r2', 'MAE', 'MSE', 'RMSE',
                      'r2_train', 'MAE_train', 'MSE_train', 'RMSE_train']

    def _knn_row(label, features):
        # Build a one-row frame of all 8 metrics for one feature set.
        reg = MyKnnRegression(features, cab_type, ride_type, 7)
        values = [label,
                  reg.metric_r2(), reg.metric_MAE(),
                  reg.metric_MSE(), reg.metric_RMSE(),
                  reg.metric_r2_train(), reg.metric_MAE_train(),
                  reg.metric_MSE_train(), reg.metric_RMSE_train()]
        return pd.DataFrame(np.asarray(values).reshape(1, 9),
                            columns=metric_columns)

    multi_features = ['price', 'distance', 'temperature', 'precipIntensity',
                      'humidity', 'windSpeed', 'time', 'weekday']
    all_dfs = [_knn_row('KNN Simple Linear Regression', ['price', 'distance']),
               _knn_row('KNN Multiple Regression', multi_features)]
    return pd.concat(all_dfs).reset_index(drop=True)
# KNN metrics for both companies' Standard rides (table echoed below).
performance_metrics(['Uber', 'Lyft'], ['Standard'])
Regression Type r2 MAE MSE RMSE r2_train MAE_train MSE_train RMSE_train
0 KNN Simple Linear Regression 0.551 1.027 2.787 1.669 0.561 1.029 2.743 1.656
1 KNN Multiple Regression 0.518 1.098 2.99 1.729 0.644 0.951 2.225 1.492
def plot_learning_curves_knn(X, y, degrees):
    """Plot train/validation RMSE against the number of training samples.

    NOTE: the X and y arguments are currently ignored — the data comes from a
    fresh MyKnnRegression(['price','distance'], ['Uber'], ['Standard'])
    split. `degrees` is used as the number of neighbors K.
    """
    knn_source = MyKnnRegression(['price','distance'], ['Uber'],['Standard'],degrees)
    X_train, X_val, y_train, y_val = knn_source.split_data()
    train_errors = []
    val_errors = []
    for m in range(1, 500):
        # Always fit on at least `degrees` rows so K neighbors exist.
        sample_count = degrees + m
        knn = neighbors.KNeighborsRegressor(n_neighbors = degrees)
        knn.fit(X_train[:sample_count], y_train[:sample_count])
        train_pred = knn.predict(X_train[:sample_count])
        val_pred = knn.predict(X_val)
        train_errors.append(metrics.mean_squared_error(y_train[:sample_count], train_pred))
        val_errors.append(metrics.mean_squared_error(y_val, val_pred))
    plt.figure(figsize = (15,9))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.title('Learning Curve')
    plt.xlabel('Number of Samples')
    plt.ylabel('RMSE')
    plt.legend(loc = 'upper right')
    return plt.plot()
# Driver cell: learning curve for the K=7 KNN model.
X = df['distance']
y = df['price']
# knn_reg = neighbors.KNeighborsRegressor(n_neighbors = degrees)
# plot_learning_curves_knn(X, y, 3)
plot_learning_curves_knn(X, y, 7)  # NOTE(review): X and y are ignored inside the function
[]